CRMetrics class

Load the library

library(CRMetrics)
## Registered S3 methods overwritten by 'ggpp':
##   method                  from   
##   heightDetails.titleGrob ggplot2
##   widthDetails.titleGrob  ggplot2
library(magrittr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(cowplot)
library(ggplot2)
library(qs)
## qs 0.27.2. Announcement: https://github.com/qsbase/qs/issues/103

Load metadata

metadata <- read.table("metadata.tsv", sep = "\t", header = T)
crmc <- CRMetrics$new(data.path = "counts_premrna/", 
                     metadata = metadata, 
                     n.cores = 32)
crmc$selectMetrics()
##    no                                        metrics
## 1   1                      estimated number of cells
## 2   2                            mean reads per cell
## 3   3                          median genes per cell
## 4   4                                number of reads
## 5   5                                 valid barcodes
## 6   6                          sequencing saturation
## 7   7                           q30 bases in barcode
## 8   8                          q30 bases in rna read
## 9   9                      q30 bases in sample index
## 10 10                               q30 bases in umi
## 11 11                         reads mapped to genome
## 12 12             reads mapped confidently to genome
## 13 13 reads mapped confidently to intergenic regions
## 14 14   reads mapped confidently to intronic regions
## 15 15     reads mapped confidently to exonic regions
## 16 16      reads mapped confidently to transcriptome
## 17 17                 reads mapped antisense to gene
## 18 18                        fraction reads in cells
## 19 19                           total genes detected
## 20 20                     median umi counts per cell
crmc$plotSummaryMetrics(metrics = crmc$selectMetrics(ids = 2))$data$value %>% max
## Using 'sample' for 'comp.group'
## [1] 596564

CellBender

p <- crmc$prepareCellbender()
## 2024-12-04 22:57:12.993205 Started run using 30 cores
## 2024-12-04 22:57:12.996188 Loading HDF5 Cell Ranger outputs
## 2024-12-04 22:57:13.06574 Loading 30 count matrices using 30 cores
## 2024-12-04 22:59:09.213337 Done!
## 2024-12-04 22:59:09.214395 Using stored UMI counts calculations. To overwrite, set $cellbender$umi.counts <- NULL
## 2024-12-04 22:59:09.214982 Plotting
## 2024-12-04 22:59:09.68895 Done!
p

droplets <- crmc$getTotalDroplets()
droplets
##   CTRL_037   CTRL_039 CTRL_09051 CTRL_09055 CTRL_09057 CTRL_09148 CTRL_09155 
##      22110      18954      18699      18270      10263      30294      11430 
## CTRL_10055  CTRL_1467   CTRL_652   MSA_1352   MSA_1365   MSA_1371   MSA_1375 
##       4305       9966      11493      11769      10593       9537       5178 
##   MSA_1391   MSA_1398   MSA_1406   MSA_1436    PD_6900    PD_7044    PD_7050 
##       8610      10434       5418      19383      13344       9663       9549 
##    PD_7219    PD_7679    PD_7731    PD_7754    PD_7958    PD_8095   PD_K1376 
##      13050       3150      11664       9867      10083       1836      11964 
##   PD_K1411   PD_K1449 
##      17889      15852
droplets["CTRL_039"] <- 1.5e4 
droplets["CTRL_09051"] <- 2.5e4 
droplets["CTRL_09055"] <- 1.5e4
droplets["CTRL_09155"] <- 11430
droplets["CTRL_10055"] <- 1.5e4 
droplets["CTRL_1467"] <- 1.5e4 
droplets["CTRL_652"] <- 11493
droplets["MSA_1352"] <- 11769
droplets["MSA_1371"] <- 9537 
droplets["MSA_1375"] <- 5178
droplets["MSA_1406"] <- 1e4
droplets["PD_7679"] <- 5e3
droplets["PD_7754"] <- 1.5e4
droplets["PD_8095"] <- 3e3

subset <- c("CTRL_039","CTRL_09051","CTRL_09055","CTRL_09155","CTRL_10055","CTRL_1467","CTRL_652","MSA_1352","MSA_1371","MSA_1375","MSA_1406","PD_7679","PD_7754","PD_8095")

crmc$prepareCellbender(samples = subset, total.droplets = droplets[subset], verbose = F)

Save script

crmc$saveCellbenderScript(file = "cellbender_script.sh",
                          total_droplets = droplets)

Plot

p <- crmc$plotCbCells(samples = crmc$metadata$sample %>% as.character() %>% .[. != "MSA_1406"]) +
  scale_x_discrete(labels = c(paste("CTRL", seq(10)), paste("MSA", seq(7)), paste("PD", seq(12)))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

p

p <- crmc$plotCbTraining()

p

p <- crmc$plotCbCellProbs() +
  theme(line = element_blank(),
        axis.ticks.x = element_line())

p

crmc$plotCbAmbExp()
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

p <- crmc$plotCbAmbGenes() +
  theme(line = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        axis.ticks.x = element_line())

p

Add to cms object

crmc$addDetailedMetrics(cellbender = TRUE)

Plot detailed metrics

metrics.to.plot <- crmc$detailed.metrics$metric %>%
  unique()
crmc$plotDetailedMetrics(metrics = metrics.to.plot, plot.geom = "violin", comp.group = "flowcell")

Embed cells using Conos and UMAP

In order to plot our cells in a UMAP embedding, we need to perform preprocessing of the raw count matrices. To do this, either pagoda2 (default) or Seurat can be used.

crmc$doPreprocessing()

Then, we create the UMAP embedding using conos.

crmc$createEmbedding()

We can now plot our cells.

crmc$plotEmbedding()

Cell depth

We can plot cell depth, both in the UMAP embedding or as histograms for all samples or per sample.

p2a <- crmc$plotEmbedding(depth = TRUE, depth.cutoff = 5e2)
p2a

crmc$plotDepth(cutoff = 5e2)
## Warning: Removed 403 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 403 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 403 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 388 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 388 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 388 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 291 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 290 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 291 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 394 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 394 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 394 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 437 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 437 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 437 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 372 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 371 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 372 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 349 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 349 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 349 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 308 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 306 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 308 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 360 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 359 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 360 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 307 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 306 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 307 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 410 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 410 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 410 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 394 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 393 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 394 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 410 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 410 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 410 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 266 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 266 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 266 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 302 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 302 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 302 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 412 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 412 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 412 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 391 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 391 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 391 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 448 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 447 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 448 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 355 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 355 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 355 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 448 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 448 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 448 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 66 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 64 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 66 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 271 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 271 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 271 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 178 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 175 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 178 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 418 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 418 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 418 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 10 rows containing non-finite outside the scale range (`stat_align()`).
## Removed 10 rows containing non-finite outside the scale range (`stat_align()`).
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 290 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 290 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 290 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 347 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 347 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 347 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 462 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 462 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 462 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 323 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 323 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 323 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 154 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 154 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 154 rows containing missing values or values outside the scale range
## (`geom_line()`).

Doublets

Scrublet

crmc$detectDoublets(conda.path = "/opt/software/miniconda/4.12.0/condabin/conda")

We can plot the estimated doublets in the UMAP embedding.

p1 <- crmc$plotEmbedding(doublet.method = "scrublet")
p1

And we can plot the scores for the doublet estimations.

crmc$plotEmbedding(doublet.method = "scrublet", doublet.scores = TRUE)

DoubletDetection

crmc$detectDoublets(conda.path = "/opt/software/miniconda/4.12.0/condabin/conda", method = "doubletdetection")

We can plot the estimated doublets in the UMAP embedding.

p2 <- crmc$plotEmbedding(doublet.method = "doubletdetection")
p2

And we can plot the scores for the doublet estimations.

crmc$plotEmbedding(doublet.method = "doubletdetection", doublet.scores = TRUE)

Differences between methods

Bar plot per sample

scrub.res <- crmc$doublets$scrublet$result %>% 
  select(labels, sample) %>% 
  mutate(method = "scrublet")
dd.res <- crmc$doublets$doubletdetection$result %>% 
  select(labels, sample) %>% 
  mutate(labels = as.logical(labels), 
         method = "DoubletDetection")

dd.res[is.na(dd.res)] <- FALSE

plot.df <- rbind(scrub.res,
                 dd.res) %>% 
  filter(labels) %>% 
  group_by(sample, method) %>% 
  summarise(count = n())
## `summarise()` has grouped output by 'sample'. You can override using the
## `.groups` argument.
p5 <- ggplot(plot.df, aes(sample, count, fill = method)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
        legend.position = "bottom") +
  labs(x = "", y = "No. doublets", fill = "Method", title = "Doublets per sample") + 
  scale_fill_manual(values = c("blue","red"))
p5

Bar plot per method

p4 <- plot.df %>% 
  group_by(method) %>% 
  summarise(count = sum(count)) %>% 
  ggplot(aes(method, count, fill = method)) + 
  geom_bar(stat = "identity") +
  theme_bw() +
  guides(fill = "none") +
  labs(x = "", y = "No. doublets", title = "Total doublets per method") + 
  theme(legend.position = "bottom") + 
  scale_fill_manual(values = c("blue","red"))
p4

UMAP

plot.vec <- data.frame(scrublet = scrub.res$labels %>% as.numeric(), 
                       doubletdetection = dd.res$labels %>% as.numeric()) %>% 
  apply(1, \(x) if (x[1] == 0 & x[2] == 0) "Kept" else if (x[1] == 1 & x[2] == 0) "scrublet" else if (x[1] == 0 & x[2] == 1) "DoubletDetection" else "Both") %>% 
  setNames(rownames(scrub.res)) %>% 
  factor(levels = c("Kept","scrublet","DoubletDetection","Both"))

p3 <- crmc$con$plotGraph(groups = plot.vec, mark.groups = FALSE, show.legend = TRUE, shuffle.colors = TRUE, title = "Doublets", size = 0.3) +
  scale_color_manual(values = c("grey80","red","blue","black")) + 
  theme(legend.position = "bottom")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
p3

CTRL_039

plot.vec.ctrl039 <- plot.vec[grepl("CTRL_039", names(plot.vec))]

p6 <- crmc$con$plotGraph(groups = plot.vec.ctrl039, plot.na = FALSE, mark.groups = FALSE, show.legend = TRUE, shuffle.colors = TRUE, title = "Doublets, CTRL_039") +
  scale_color_manual(values = c("grey80","blue","red","black")) + 
  theme(legend.position = "bottom")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
p6

PD_7044

plot.vec.pd7044 <- plot.vec[grepl("PD_7044", names(plot.vec))]

p7 <- crmc$con$plotGraph(groups = plot.vec.pd7044, plot.na = FALSE, mark.groups = FALSE, show.legend = TRUE, shuffle.colors = TRUE, title = "Doublets, PD_7044") +
  scale_color_manual(values = c("grey80","red","blue","black")) + 
  theme(legend.position = "bottom")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
p7

Summarised plot

plot_grid(plotlist = list(p1,p2,p3,p4,p5,p6,p7), ncol = 3)

Mitochondrial fraction

We can also investigate the mitochondrial fraction in our cells

p1a <- crmc$plotEmbedding(mito.frac = TRUE)
p1a

Plot filtered cells

We can plot all the cells to be filtered in the UMAP embedding

crmc$plotFilteredCells(type = "embedding", depth.cutoff = 5e2, doublet_method = "scrublet")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

p3a <- crmc$plotFilteredCells(type = "embedding", depth.cutoff = 5e2, doublet_method = "doubletdetection") + 
  theme(legend.position = "bottom")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
p3a

And we can plot the cells to be filtered per sample

crmc$plotFilteredCells(type = "bar", doublet_method = "scrublet", depth.cutoff = 5e2)
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

p4a <- crmc$plotFilteredCells(type = "bar", doublet.method = "doubletdetection", depth.cutoff = 5e2) +
  theme(legend.position = "bottom")
p4a
## Warning: ggrepel: 22 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Summarized plot

plot_grid(plotlist = list(p1a,p2a,p3a,p4a), ncol = 2)
## Warning: ggrepel: 79 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

We can also extract the raw numbers for plotting in other ways than those included here

filter.data <- crmc$plotFilteredCells(type = "export", doublet.method = "doubletdetection")
filter.data %>% head()
## # A tibble: 6 × 4
##   sample   cell                         variable value
##   <chr>    <chr>                        <chr>    <dbl>
## 1 CTRL_037 CTRL_037!!GCACATATCCTAGCCT-1 depth        0
## 2 CTRL_037 CTRL_037!!GCACATATCCTAGCCT-1 mito         0
## 3 CTRL_037 CTRL_037!!GCACATATCCTAGCCT-1 doublets     0
## 4 CTRL_037 CTRL_037!!GATAGCTCATGTTTGG-1 depth        0
## 5 CTRL_037 CTRL_037!!GATAGCTCATGTTTGG-1 mito         0
## 6 CTRL_037 CTRL_037!!GATAGCTCATGTTTGG-1 doublets     0

Save filtered CMs

Finally, we can filter the count matrices and save them for downstream applications.

crmc$filterCms(prefix = "cms_filtered",
               method = "qs",
               depth.cutoff = 5e2, 
               mito.cutoff = 0.05, 
               doublets = "doubletdetection", 
               samples.to.exclude = "MSA_1406", 
               n.cores = 100)